################################################################
#Script: MultipleImputation_DBP.R
################################################################
#Project: Gov 2001 Project; Do Businesses Pay to Do Science?; Extension of Giuri P. and Mariani M., "When Distance Disappears", 2013
#Script Goal: Load PatVal-EU data, fix minor discrepancies, create multiple imputation datasets
#Inputs: 
#-'data_complete.dta'
#dta_complete.dta can be retreived from emailing the authors of "When Distance Disappears"
#Outputs: Multiple Imputed Data Files in .csv format
###############################################################
rm(list=ls())

##Packages
#install.packages("Amelia")
#install.packages("foreign")
#install.packages("pastecs")
#install.packages("boot")

library(Amelia)
library(foreign)
library(pastecs)
library(boot)

##Working Directory, Temp Folder
root<-"ENTER YOUR WORKING DIRECTORY HERE"
setwd(root)

##Set Seed
set.seed(02127)

##create imputed data directory
tempDir <- "WIP"
if (file.exists(tempDir)){
} else {
  dir.create(file.path(root, tempDir))
}


tempDir <- "WIP/ImputedData"
if (file.exists(tempDir)){
} else {
  dir.create(file.path(root, tempDir))
}

##Read in data
df.main <- read.dta("data_complete.dta")

################################################################
#Dealing with Missing and Erroneous Data Correction
################################################################
##These individuals were assigned two applicant flags. We are assuming 'Individual_Applic' is correct for these patent observations given the absence of firm level data
df.main[1499,"PRI_Applic"] <- 0
df.main[3549,"Firm_Applic"] <- 0
df.main[3714,"Firm_Applic"] <- 0
df.main[7408,"Firm_Applic"] <- 0

## Removing obs with high missingness based on missing intotprob (inverse total probability)
df.main <- df.main[-which(is.na(df.main$invtotprob)),]

## Replacing "" with NA in nuts2_d_res_country
for (i in 1:length(df.main[,1])) {
  if (df.main[i,"nuts2_d_res_country"]=="") {
    df.main[i,"nuts2_d_res_country"]=NA
  }
}

## Replacing "" with NA in nuts3_d_res
for (i in 1:length(df.main[,1])) {
  if (df.main[i,"nuts3_d_res"]=="") {
    df.main[i,"nuts3_d_res"]=NA
  }
}

## Replacing 0--> NAs indicated by variable 'dum_miss_reg_mob' in data (NAs correctly identify missingness)
for (i in 1:length(df.main[,1])) {
  if (df.main[i,"dum_miss_reg_mob"]==1) {
    df.main[i,"reg_mob_nuts3In"]=NA
    df.main[i,"reg_mob_nuts3Out"]=NA
    }
}

## Replacing 0-->NAs in variable 'RDint' (R&D Intensity) indicated by 'D_missC_RD' (NAs correctly identify missingness)
for (i in 1:length(df.main[,1])) {
  if (df.main[i,"D_missC_RD"]==1) {
    df.main[i,"RDint"]=NA
  }
}

##Replacing NAs in Employees
for (i in 1:length(df.main[,1])) {
  if (df.main[i,"D_miss_empl"]==1) {
    df.main[i,"EmployeesMiss"]=NA
    df.main[i,"lEmployeesMiss"]=NA
  }
}


## Code used to identify bad NA and assignment to multiple categories, which is corrected above
for (i in 1:length(df.main[,1])) {
  if (is.na(df.main[i,"Firm_Applic"])==TRUE || is.na(df.main[i,"PRI_Applic"])==TRUE || is.na(df.main[i,"Individual_Applic"])==TRUE)  {
    if (is.na(df.main[i,"Firm_Applic"])==FALSE || is.na(df.main[i,"PRI_Applic"])==FALSE || is.na(df.main[i,"Individual_Applic"])==FALSE) {
      print("broad error")
      print(i)
    }
  }
}

## Make factor var 'Applicant' from Firm_Applic, PRI_Applic, Individual_Applic
df.main[,"Applicant"] <- rep(NA,length(df.main[,1])) 
for (i in 1:length(df.main[,1])) {
  if (is.na(df.main[i,"Firm_Applic"])==FALSE || is.na(df.main[i,"PRI_Applic"])==FALSE) {
    if (df.main[i,"Firm_Applic"]==1) {
      if (df.main[i,"PRI_Applic"]==1) {
        print("Error Firm PRI: ")
        print(i)
      }
    }
  }
  
  if (is.na(df.main[i,"Firm_Applic"])==FALSE || is.na(df.main[i,"Individual_Applic"])==FALSE) {
    if (df.main[i,"Firm_Applic"]==1) {
      if (df.main[i,"Individual_Applic"]==1) { 
          print("Error Firm Individ:")
          print(i)
      }
    }
  }
  
  if (is.na(df.main[i,"PRI_Applic"])==FALSE || is.na(df.main[i,"Individual_Applic"])==FALSE) {
    if (df.main[i,"PRI_Applic"]==1) {
      if (df.main[i,"Individual_Applic"]==1) { 
        print("Error PRI Individ:")
        print(i)
      }
    }
  }
 
  if (is.na(df.main[i,"Firm_Applic"])==FALSE) {
    if (df.main[i,"Firm_Applic"]==1) {
      df.main[i,"Applicant"]="Firm"
    }
  }
  
  if (is.na(df.main[i,"PRI_Applic"])==FALSE) {
    if (df.main[i,"PRI_Applic"]==1) {
      df.main[i,"Applicant"]="PRI"
    }
  }
  
  if (is.na(df.main[i,"Individual_Applic"])==FALSE) {
    if (df.main[i,"Individual_Applic"]==1) {
      df.main[i,"Applicant"]="Individual"
    }
  }
}

## Make factor var 'Location' from Country-based dummies
df.main[,"Location"] <- rep(NA,length(df.main[,1])) 
for (i in 1:length(df.main[,1])) {
  if (is.na(df.main[i,"UK"])==FALSE) {
    if (df.main[i,"UK"]==1) {
      df.main[i,"Location"]="UK"
    }
  }

  if (is.na(df.main[i,"DE"])==FALSE) {
    if (df.main[i,"DE"]==1) {
      df.main[i,"Location"]="DE"
    }
  }
  
  if (is.na(df.main[i,"IT"])==FALSE) {
    if (df.main[i,"IT"]==1) {
      df.main[i,"Location"]="IT"
    }
  }
  
  if (is.na(df.main[i,"ES"])==FALSE) {
    if (df.main[i,"ES"]==1) {
      df.main[i,"Location"]="ES"
    }
  }
  
  if (is.na(df.main[i,"NL"])==FALSE) {
    if (df.main[i,"NL"]==1) {
      df.main[i,"Location"]="NL"
    }
  }
}

################################################################
#Multiple Imputation of Missing Data Using Amelia II
################################################################
#Amelia II - Preprogrammed imputation
#James Honaker, Gary King, Matthew Blackwell (2011). Amelia II: A Program for Missing Data. Journal of Statistical Software, 45(7), 1-47. URL http://www.jstatsoft.org/v45/i07/.
#For errors with Amelia, contact James Honaker

## Drop variables which will be backed out via transformation and calculations after running Amelia
drops <- c("CE_dummy2","DE_dummy2","age2","Low_High_School","UniMasDegree","PhDDegree","experience","zeroExp","AppYear1993","AppYear1994","AppYear1995","AppYear1995","AppYear1996","AppYear1997","AppYear1998","dum_miss_reg_mob","EmployeesMiss","D_missC_RD","D_miss_empl","gdppop_nuts3","pop_nuts3","area_nuts3_km2","av_pat_nuts3_9496","sharepat_nuts2_tc30_1","TechClass1","PhDXCoinv_d","UniXCoinv_d","PhDXScience","UniXScience", "Individ_Applic", "Firm_Applic", "PRI_Applic","Individual_Applic","UK","DE","ES","IT","NL")
df.main <- df.main[,!(names(df.main) %in% drops)]

## Define factor Variables
df.main$nuts2_d_res_country <- factor(df.main$nuts2_d_res_country)
df.main$nuts3_d_res <- factor(df.main$nuts3_d_res)
df.main$TechClass <- factor(df.main$TechClass)
df.main$Applicant <- factor(df.main$Applicant)

## Define ID Variables and other variables generally withheld because of collinearity issues in the data (a common issue due to the limited scope of the data with many numerour factor (10+) variables) 
idvars=c("id","id_parent_1","year_first_patent_resc3", "no_pastcollab",
         #variables causing colinearity
         "nuts3_d_res","reg_mob_nuts3Out","nuts2_d_res_country",
         #and to resolve issues with contrasts / errors in Amelia due to unidentified collinearity
         "AppYear","invtotprob","TechClass","Applicant")

## Define variables that must be classified as Nominal, Ordinal
noms=c("EDU","Location")
ords=c("CloseExt","DistExt")

## Define Number of Multiple Imputation Datasets
num.datasets <- 10

## Run Amelia
a.out <- amelia(df.main,m=num.datasets,ords=ords,noms=noms,idvars=idvars)
write.amelia(obj=a.out,file.stem="outdata")

## In each MI dataset: Recalculate variables based on indicator and natural log transformations that were earlier dropped 
for (i in 1:num.datasets) {
  data <- read.csv(file=paste("outdata",i,".csv",sep=""))
  data <- data[2:length(data[1,])]
  data[,"experience"] <- exp(data$lexperience)-1
  data[,"zeroExp"] <- rep(NA,length(data[,1]))
  for (j in 1:length(data[,1])) {
    if (data[j,"experience"]==0) {data[j,"zeroExp"]=0} else {data[j,"zeroExp"]=1}
  } 
  data[,"age2"] <- data$Age*data$Age
  data[,"EmployeesMiss"] <- exp(data$lEmployeesMiss)-1
  data[,"gdppop_nuts3"] <- exp(data$lgdppop_nuts3)-1
  data[,"pop_nuts3"] <- exp(data$lpop_nuts3)-1
  data[,"area_nuts3_km2"] <- exp(data$larea_nuts3_km2)-1
  data[,"av_pat_nuts3_9496"] <- exp(data$lav_pat_nuts3_9496)-1
  data[,"sharepat_nuts2_tc30_1"] <- exp(data$lsharepat_nuts2_tc30_1)-1
  write.csv(data, file=paste("WIP/ImputedData/outdata",i,".csv",sep=""))
  unlink(paste("outdata",i,".csv",sep=""))
}
